#!/usr/bin/env python3
#
#  Process the distribution of the number of journals per subject
#  
#  Copyright 2025 Eko Didik Widianto <didik@live.undip.ac.id>
#  
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 3 of the License, or
#  (at your option) any later version.
#  
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#  
#  You should have received a copy of the GNU General Public License
#  along with this program. If not, see <https://www.gnu.org/licenses/>.
#
import sys
from pathlib import Path    # Path info
import re

import pandas as pd     # Pandas

# Using seaborn for plotting
import seaborn as sns
import matplotlib.pyplot as plt

import numpy as np

# Path definition
# Every location below is derived from the directory that contains this script.
path_abs = Path(__file__).parent.absolute() # Use absolute path
# Must run before the `util` import that follows; presumably the `util` package
# lives in the parent directory (verify against the repository layout).
sys.path.append(f"{path_abs}/..") # Adds higher directory to python modules path
# Directory prefix of the journal master JSON files opened in main()
in_path = f"{path_abs}/data_master/"
# Directory prefix of the report path passed to save_aprofile() in main()
report_path = f'{path_abs}/reporting/'

# Own functions
from util.utils import save_file, open_json_file, save_aprofile


# Data file: one [source name, master journal data file stem] pair per source.
# main() unpacks each pair and opens "<in_path><file stem>.json"; the source
# name is used in the plot title.
data_file = [['GARUDA', 'gjournal'],
  ['SINTA', 'sjournal']]

def main():
  """Profile and plot the number of journals per subject area for each source.

  For every ``[source name, file stem]`` pair in ``data_file`` the journal
  master file is loaded, restricted to active journals that have at least one
  subject area, and exploded to one row per (journal, subject) pair. The
  per-subject counts and their percentage shares are printed and drawn as a
  horizontal bar plot, the first source on the left panel (x axis reversed)
  and the second on the right panel.

  The bars of the right panel are annotated with the ratio of the second
  source's share to the first source's share for the same subject.

  Side effects: prints to stdout, saves the collected profiles through
  ``save_aprofile`` and opens the figure with ``plt.show()``.

  Raises:
    ValueError: if ``data_file`` does not hold exactly two sources (the
      figure and the share comparison are built for two).
  """
  print("Number of journals per subject")
  print("================")

  jprofiles = []

  cols = ['id', 'title', 'publisher', 'pissn', 'eissn', 'subj_areas']

  # Plots: two panels on a shared y axis
  fig, ax = plt.subplots(ncols=2, figsize=(8, 5), sharey=True, gridspec_kw={'wspace': 0})
  plt.subplots_adjust(left=0.1, right=0.9,
    top=0.9, bottom=0.2,
    wspace=0.1, hspace=0.1)

  color_idx = ["#fdae61", "#4575b4"]
  ax[0].set_ylabel('Subjects', fontsize=11)

  # Per-source subject tables, kept in the order of data_file
  subj_frames = []

  for i, mdata in enumerate(data_file):
    src, jour_src = mdata

    jprofile = {}

    # NOTE(review): open_json_file presumably returns a pandas DataFrame with
    # a boolean 'is_active' column - confirm against util.utils
    jdf = open_json_file(in_path + jour_src + '.json')
    # Filter active journals. The copy makes the column assignments below act
    # on an independent frame instead of on a slice of the loaded data.
    jdf = jdf[jdf['is_active']][cols].copy()

    # Missing subject areas become an empty list so they count as zero
    jdf['subj_areas'] = jdf['subj_areas'].fillna('').apply(list)
    jdf['nsubj_areas'] = jdf['subj_areas'].str.len()
    jdf = jdf[jdf['nsubj_areas'] > 0].reset_index(drop=True)

    # One row per (journal, subject) pair
    jdf_ex = jdf.explode(['subj_areas'], ignore_index=True)

    njournals = len(jdf.index)
    npairs = len(jdf_ex.index)

    # Profiling
    jprofile['src'] = jour_src
    jprofile['njournals'] = njournals
    jprofile['nsubjects'] = npairs
    jprofiles.append(jprofile)

    # Prepare data for plot: count per subject, ordered by the first number
    # found in the subject name (descending)
    subj_df = (
      jdf_ex.groupby(['subj_areas'])
      .agg({'subj_areas':'count'})
      .rename(columns={'subj_areas':'nsubj_areas'})
      .reset_index()
      .sort_values(by=['subj_areas'], key=lambda x: x.apply(extract_numeric_key), ascending=False)
      .reset_index(drop=True)
    )

    # Share of each subject among all (journal, subject) pairs, in percent
    subj_df['nsubj_areasp'] = round(100 * subj_df['nsubj_areas'] / npairs, 2)

    print(subj_df)

    draw_barplot(subj_df, 'nsubj_areas', ax[i],
      title = f"{src}\nNj: {njournals:,d},  Ns: {npairs:,d}",
      color = color_idx[i], xreverse=(i == 0))

    means = subj_df['nsubj_areasp'].mean()
    means_ = subj_df['nsubj_areas'].mean()

    stddev = subj_df['nsubj_areasp'].std()
    stddev_ = subj_df['nsubj_areas'].std()

    # jprofile is already in jprofiles; these keys extend the same dict
    jprofile['njour_mean'] = f"{round(means_):,d} ({means:.2f} %)"
    jprofile['njour_std'] = f"{round(stddev_):,d} ({stddev:.2f} %)"

    subj_frames.append(subj_df)

  # Ratio of the second source's share to the first source's share. The
  # shares are matched by subject name, so a subject missing from one source
  # yields NaN instead of being paired with a different subject's row.
  base_df, other_df = subj_frames
  base_share = base_df.set_index('subj_areas')['nsubj_areasp']
  comp = other_df['nsubj_areasp'] / other_df['subj_areas'].map(base_share)

  # Add value: comp follows the row order of the second source, which is the
  # order in which its bars were drawn on ax[1]
  for p, val in zip(ax[1].patches, comp):
    _y = p.get_y() + p.get_height() / 4
    ax[1].text(0, _y + 0.3, f'{val:.2f}', ha='left', color='white')

  fig.supxlabel('Number of journals (%)', fontsize=11, y = 0.08)  # Common x label

  save_aprofile(jprofiles, report_path + "01-jour_subjs", src="records", desc="01. Number of journal by subject count")
  plt.tight_layout()
  plt.show()

# Function to extract and convert the numerical part of the string
def extract_numeric_key(s):
    """Return the first run of digits in ``s`` as an int, or 0 if there is none.

    Used as a sort key so that labels are ordered by the number they contain
    rather than alphabetically.
    """
    digits = re.search(r'\d+', s)  # First run of one or more digits, if any
    return 0 if digits is None else int(digits.group(0))

def draw_barplot(df, col, ax, title, color, xreverse=False):
  """Draw a horizontal bar plot of subject shares on ``ax`` and annotate it.

  The bar length is taken from column ``col + 'p'``, the text placed at the
  end of each bar from column ``col``, and the bar categories from
  ``df['subj_areas']``. Two vertical dashed lines mark the mean of
  ``col + 'p'`` and the mean minus one standard deviation; each is captioned
  with the corresponding statistic of ``col``.

  Parameters:
    df: DataFrame holding ``'subj_areas'``, ``col`` and ``col + 'p'``, one
      row per bar in drawing order.
    col: name of the count column; ``col + 'p'`` must hold the values to plot
      (the caller in this file stores percentage shares there).
    ax: matplotlib axes to draw on.
    title: axes title.
    color: bar color.
    xreverse: if True the x axis is inverted and value labels are
      right-aligned; otherwise the left spine is hidden.
  """
  pcol = col + 'p'

  ax.spines['top'].set_visible(False)
  ax.spines['right'].set_visible(False)

  # Set before plotting; NOTE(review): presumably a non-empty label keeps
  # seaborn from writing its own x label - confirm for the seaborn version used
  ax.set_xlabel(' ')
  ax.margins(x=0.3)

  # Add title
  ax.set_title(title, fontsize=10, y = 1.05)

  sns.barplot(data=df,
    x = pcol, y = df['subj_areas'], ax=ax,
    dodge = True, color = color,
    width = 0.85,
    orient = 'horizontal')

  # Add value: bars and rows are paired by position, so the labels do not
  # depend on df carrying a 0..n-1 index
  for p, count in zip(ax.patches, df[col]):
    _y = p.get_y() + p.get_height() / 4
    _x = p.get_width()
    ax.text(_x + 1, _y + 0.3, f'{int(count):,d}', ha='right' if xreverse else 'left')

  # Add mean lines and mean - stddev, using the same columns as the bars
  # (pcol) and the value labels (col)
  means = df[pcol].mean()
  means_ = df[col].mean()

  stddev = df[pcol].std()
  stddev_ = df[col].std()

  mstd = means - stddev
  mstd_ = means_ - stddev_

  # Read the limits of the axes being drawn on, not of pyplot's current axes
  ymin, ymax = ax.get_ylim()
  ax.axvline(x=means,color='red', linestyle='--', label='Mean', linewidth=1)
  ax.text(means, ymax, '\u03BC: ' + f'{round(means_):,d}', ha='center')

  # Own label so a legend can tell this line apart from the mean line
  ax.axvline(x=mstd,color='blue', linestyle='--', label='Mean - Std', linewidth=0.5)
  ax.text(mstd, ymax, '\u03BC' + '-' + '\u03C3: ' + f'{round(mstd_):,d}', ha='center')

  if xreverse:
    ax.invert_xaxis() #  Revert direction
  else:
    ax.spines['left'].set_visible(False)
  
# Run the script: main() is called only when this file is executed directly,
# not when it is imported as a module
if __name__ == '__main__':
  main()
